### DESeq2 Practical 01-Feb-2020 ###

###################################################	
### code chunk number 1	
###################################################	

# load packages
# *install if missing*
library(DESeq2) 
library(umap)
library(tidyverse)
library(data.table)
library(fs) 
library(magrittr) 
library(MatrixGenerics)
library(ggrepel)
library(preprocessCore)
library(ggforce)

#to export figures in svg format
library(svglite)

# 
# Check paths
# Automated paths:	
#' 	

# Input directory. count.txt is read from here, and because it also becomes
# the working directory, every relative file name used later in the script is
# resolved against it. Edit this path to match your machine.
path_input <- "/Users/yoshiko/Desktop/20230130-4-group-based/"

setwd(path_input)

# Output directory for the DESeq2 tables and figures.
path_output <- "/Users/yoshiko/Desktop/20230130-4-group-based/DESeq2_WT_4classes/"

# Create the output directory only when it is missing, so that re-running the
# script does not raise an "already exists" warning.
if (!dir.exists(path_output)) {
  dir.create(path_output)
}
#' 	


###################################################	
### code chunk number 2	
###################################################	
# Collect the gene names found on chrY in the mm10 RefSeq GTF.
# Change the path below to wherever mm10.ncbiRefSeq.gtf is stored.
gtf <- fread("/Users/yoshiko/Desktop/20230130-4-group-based/mm10.ncbiRefSeq.gtf") %>%
  filter(V1 == "chrY") %>%
  select(V9)

# V9 is the column whose entries are searched for gene_name below.
gtf_Y <- gtf$V9

# Break every V9 string into its ";"-separated pieces and keep only the
# pieces that mention gene_name.
attribute_fields <- unlist(str_split(gtf_Y, pattern = ";"))
gene_name_fields <- attribute_fields[str_detect(attribute_fields, "gene_name")]

# Strip spaces, the gene_name" prefix and any remaining double quotes, which
# leaves the bare gene names; then drop repeated names.
gtf_Y_df <- gene_name_fields %>%
  gsub(" ", "", .) %>%
  gsub("gene_name\\\"", "", .) %>%
  gsub("\\\"", "", .) %>%
  unique()

## Read in Data/Metadata
# Path of the count table inside the input directory.
file1 <- paste0(path_input, "count.txt")

# Genes removed before the analysis: every chrY gene name plus Xist.
sex_related_genes <- c(gtf_Y_df, "Xist")

#readCounts <- fread(paste0(path_input,"Read.txt"))

# Load the counts, drop the sex-related genes, move Geneid into the row names
# and discard the first five of the remaining columns.
rawCounts <- fread(file1, header = TRUE) %>%
  filter(!(Geneid %in% sex_related_genes)) %>%
  tibble::column_to_rownames("Geneid") %>%
  select(-(1:5)) %>%
  # rename(new = old): the column read from the 119_Aligned BAM is given the
  # 611_Aligned name.
  rename(
    "/exports/eddie/scratch/v1yikush/after_alignment/611_Aligned.sortedByCoord.out.bam" =
      "/exports/eddie/scratch/v1yikush/after_alignment/119_Aligned.sortedByCoord.out.bam"
  )

# Reduce each BAM path to a short sample ID: remove everything up to the last
# "/" and everything from "_Aligned" onwards.
sample_name <- colnames(rawCounts)
sample_name <- gsub("^/.*/", "", sample_name)
sample_name <- gsub("_Aligned.*$", "", sample_name)
colnames(rawCounts) <- sample_name

# Put the sample columns into sorted order. all_of() states explicitly that
# the names come from an external character vector and fails loudly if one of
# them is not a column.
rawCounts <- rawCounts %>% select(all_of(sort(sample_name)))

# Per-sample metadata, read as a tab-separated file from the working directory.
sample_info <- fread("Sample_info.csv", sep = "\t")

# Omit paired-end sequence data because they differ too much from single-end data


samples <- colnames(rawCounts)

# The metadata is attached to the count columns purely by position, so both
# must have the same length; otherwise data.frame() could silently recycle
# the shorter one.
# NOTE(review): this also assumes the rows of Sample_info.csv are in the same
# order as colnames(rawCounts) - confirm against the file.
stopifnot(nrow(sample_info) == length(samples))

sex <- factor(sample_info$Sex)
diet <- factor(sample_info$Diet)
genotype <- factor(sample_info$Genotype)

# Keep only WT samples and build the combined diet_sex grouping variable.
sampleData <- data.frame(samples, sex, diet, genotype) %>%
  dplyr::filter(genotype == "WT") %>%
  mutate(class = paste(diet, sex, sep = "_"))

# Restrict the count table to the retained samples.
rawCounts <- rawCounts %>% dplyr::select(sampleData$samples)

# Keep genes whose counts summed over the retained samples exceed 50.
rawCounts_filt <- rawCounts %>% filter(rowSums(.) > 50)

# Drop rows whose counts are an exact duplicate of an earlier row.
# NOTE(review): this removes genes that merely share an identical count
# profile with another gene - confirm that this is intended.
rawCounts_filt <- rawCounts_filt[!duplicated(rawCounts_filt), ]


# Change the design formula (e.g. to sex+diet) as appropriate.
DESeq2Object <- DESeqDataSetFromMatrix(
  countData = rawCounts_filt,
  colData = sampleData,
  design = ~class
)

# Output normalized counts: estimate size factors on a separate object,
# extract the normalized count matrix and write it to the output directory
# with the gene IDs in a leading GeneName column.
dds <- estimateSizeFactors(DESeq2Object)
normalized_counts <- counts(dds, normalized = TRUE)
d <- tibble(rownames_to_column(data.frame(normalized_counts), "GeneName"))
fwrite(
  data.frame(d),
  paste0(path_output, "Counts.normalized_WT.txt"),
  sep = "\t"
)



# Report the dimensions of the data set, then run DESeq() on it.
dim(DESeq2Object)
DESeq2Object <- DESeq(DESeq2Object)

# Transform the data with vst(); the PCA below works on this output.
vst_out <- vst(DESeq2Object)

# Keep the (up to) 500 rows with the largest standard deviation across
# samples. head() replaces [1:500] so that a data set with fewer than 500
# rows does not produce NA row indices, and drop = FALSE keeps a matrix even
# if a single row remains.
stdev <- apply(assay(vst_out), 1, sd)
top_variable <- head(rev(order(stdev)), 500)
vst_short <- assay(vst_out)[top_variable, , drop = FALSE]

#pcaData <- data.frame(prcomp(t(assay(vst_out)))$x)
#percentVar <- round(100 *  summary(prcomp(t(assay(vst_out))))$importance[2,])

# Fit the PCA once and take both the sample scores and the percentage of
# variance per component from the same fit.
pca_fit <- prcomp(t(vst_short))
pcaData <- data.frame(pca_fit$x)
percentVar <- round(100 * summary(pca_fit)$importance[2, ])

#pcaData=plotPCA(vst_out, intgroup = c("samples", "class"), returnData = TRUE, ntop=12000)
pcaData %>% head()
#percentVar <- round(100 * attr(prcomp(t(assay(vst_out))), "percentVar"))

# Grouping used to colour the PCA; change it (e.g. genotype -> sex) as
# appropriate.
class <- paste(sampleData$diet, sampleData$sex, sep = "_")

p1 <- ggplot(pcaData, aes(x = PC1, y = PC2, fill = class)) +
  geom_point(size = 5, alpha = 0.8, pch = 21, color = "Black") +
  #geom_text_repel(aes(label=sampleData$samples),max.overlaps = 5)+
  geom_mark_ellipse(aes(color = class)) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed() +
  ggtitle("PCA") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 15),
    axis.title = element_text(size = 15)
  )

p1
file_out <- paste0(path_output, "PCA_1_WT_noID.svg")
ggsave(filename = file_out, plot = p1, width = 8, height = 7, dpi = 600)

# Output the principal-component scores of every sample.
# NOTE(review): unlike the other outputs, this file is written to the working
# directory rather than path_output - confirm that this is intended.
write.table(pcaData, "WT_pcaData.tsv", quote = FALSE, sep = "\t", col.names = NA)

#######Umap
#u <- umap(t(vst_short))
#u.df <- data.frame(u$layout)
#p1=ggplot(u.df, aes(x = X1, y = X2, fill = class)) +
#  geom_point(size=5, alpha=0.8,pch=21,color ="Black") +
#  geom_text_repel(aes(label=sampleData$samples),max.overlaps = 5)+
#  geom_mark_ellipse(aes(color=class))+
#  xlab(paste0("UMAP_1")) +
#  ylab(paste0("UMAP_2")) +
#  #xlim(c(-5,5))+
#  #ylim(c(-5,5))+
#  ggtitle("UMAP") +
#  theme_bw()+
#  theme(
#    axis.text = element_text(size=15),
#    axis.title = element_text(size = 15)
#    
#  )
#p1
#file_out = paste0(path_output,'UMAP_all_WT.png')	
#ggsave(filename =  file_out, plot = p1, width=9,height=7, dpi=600)

# Loadings of the first two principal components, one row per gene, written
# to the output directory.
pca_rotation <- prcomp(t(vst_short))$rotation
loadings <- rownames_to_column(
  data.frame(PC1 = pca_rotation[, 1], PC2 = pca_rotation[, 2]),
  "Gene"
)
fwrite(loadings, paste0(path_output, "PC_loadings_WT.tsv"), sep = "\t")

# Theme shared by both loading plots: no vertical grid lines, dashed grey
# horizontal guides, larger axis titles.
loading_theme <- theme_bw() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(colour = "grey60", linetype = "dashed"),
    axis.title = element_text(size = 14)
  )

# Genes with a PC1 loading above 0.06 or below -0.05, ordered by loading.
PC1 <- loadings %>%
  arrange(PC1) %>%
  dplyr::filter(PC1 > 0.06 | PC1 < -0.05) %>%
  select(Gene, PC1)
pc1_plot <- ggplot(PC1, aes(x = PC1, y = reorder(Gene, PC1))) +
  geom_point(size = 2) +
  xlab("PC1 loadings") +
  ylab("Gene") +
  loading_theme
ggsave(
  paste0(path_output, "PC1_loading_WT.svg"),
  plot = pc1_plot,
  width = 5,
  height = 10
)

# Genes with a PC2 loading above 0.04 or below -0.045, ordered by loading.
PC2 <- loadings %>%
  arrange(PC2) %>%
  dplyr::filter(PC2 > 0.04 | PC2 < -0.045) %>%
  select(Gene, PC2)
pc2_plot <- ggplot(PC2, aes(x = PC2, y = reorder(Gene, PC2))) +
  geom_point(size = 2) +
  xlab("PC2 loadings") +
  ylab("Gene") +
  loading_theme
ggsave(
  paste0(path_output, "PC2_loading_WT.svg"),
  plot = pc2_plot,
  width = 5,
  height = 10
)

# ---------------------------------------------------------------------------
# Helper functions. They are defined before the analysis calls further down
# so that the script also works when run top to bottom in a fresh session.
# ---------------------------------------------------------------------------

# Draw a volcano plot for one comparison and save it as SVG and PNG.
#
# Arguments:
#   deseq2Results  table with the columns geneID, log2FoldChange and padj
#   gene_list      character vector of gene IDs to highlight and label
#   file_out       output path prefix; "_volcano.svg" and "_volcano.png" are
#                  appended to it
#   FC             fold-change cut-off; dashed lines are drawn at +/- log2(FC)
#   Padj           adjusted p-value cut-off; a dashed line is drawn at
#                  -log10(Padj)
#
# Returns the value of the final ggsave() call.
plot_volcano <- function(deseq2Results, gene_list = c(""), file_out, FC = 1.5, Padj = 0.05) {
  # Seed the RNG before plotting, presumably so that the label repulsion is
  # reproducible between runs.
  set.seed(42)

  # Classify every gene. Genes in gene_list are "marked" regardless of their
  # statistics; rows with NA in padj or log2FoldChange fall through to "non".
  # One code path serves both an empty and a non-empty gene_list, so the
  # nudge_y column read by the label layers always exists.
  df <- data.frame(deseq2Results) %>%
    mutate(
      sig = case_when(
        geneID %in% gene_list ~ "marked",
        padj < Padj & log2FoldChange > log2(FC) ~ "up",
        padj < Padj & log2FoldChange < -log2(FC) ~ "down",
        TRUE ~ "non"
      ),
      mark = geneID %in% gene_list,
      nudge_y = 30
    ) %>%
    arrange(mark) # marked genes last, so they are drawn on top

  # Colours and sizes are named by category. Unnamed values are assigned to
  # whichever levels happen to be present, which shifts the colours whenever
  # a category (e.g. "marked" or "down") is missing from the data.
  sig_colours <- c(
    down = "#33DFD4",
    marked = "#FFFF00",
    non = "#CACACA",
    up = "#F73719"
  )
  mark_sizes <- c("FALSE" = 1.5, "TRUE" = 3)

  print(head(df))
  print(df$geneID[duplicated(df$geneID)])

  # Marked genes are labelled in two groups: positive fold changes are pushed
  # towards x = 12, negative ones towards x = -12.
  labels_up <- dplyr::filter(df, mark & log2FoldChange > 0)
  labels_down <- dplyr::filter(df, mark & log2FoldChange < 0)

  # NOTE(review): with these fixed axis limits, points with
  # -log10(padj) > 50 or |log2FoldChange| > 12 are not drawn.
  volcano <- ggplot(df, aes(x = log2FoldChange, y = -log10(padj), fill = sig)) +
    geom_point(pch = 21, aes(size = mark), alpha = 0.7) +
    scale_fill_manual(values = sig_colours) +
    scale_size_manual(values = mark_sizes) +
    ylim(c(0, 50)) +
    xlim(c(-12, 12)) +
    geom_text_repel(
      data = labels_up,
      aes(x = log2FoldChange, label = geneID),
      size = 5,
      max.overlaps = 25,
      segment.color = "grey50",
      segment.size = 0.2,
      nudge_x = 12 - labels_up$log2FoldChange,
      nudge_y = labels_up$nudge_y,
      direction = "y",
      box.padding = 0.5,
      color = "#000033"
    ) +
    geom_text_repel(
      data = labels_down,
      aes(x = log2FoldChange, label = geneID),
      size = 5,
      max.overlaps = 25,
      segment.color = "grey50",
      segment.size = 0.2,
      nudge_x = -12 - labels_down$log2FoldChange,
      nudge_y = labels_down$nudge_y,
      direction = "y",
      box.padding = 0.5,
      color = "#000033"
    ) +
    #geom_text_repel(data=filter(df,mark==TRUE),aes(x=log2FoldChange,y=-log10(padj),label=geneID),
    #                size=5,max.overlaps = 10,
    #                nudge_y=20,box.padding = 0.5,color="#0000CC")+
    geom_vline(xintercept = log2(FC), linetype = "dashed") +
    geom_vline(xintercept = -log2(FC), linetype = "dashed") +
    geom_hline(yintercept = -log10(Padj), linetype = "dashed") +
    theme_bw()

  # Pass the plot explicitly instead of relying on last_plot().
  ggsave(paste0(file_out, "_volcano.svg"), plot = volcano, width = 6, height = 6)
  ggsave(paste0(file_out, "_volcano.png"), plot = volcano, width = 6, height = 6)
}


# Pipeline for DESeq2 analysis: extract one contrast, save the results table
# and draw the matching volcano plot.
#
# Arguments:
#   countData   despite its name, the object handed directly to results();
#               the calls below pass a data set that has already been run
#               through DESeq()
#   sampleData  accepted for compatibility with existing calls; not used
#   contrast    name of the design variable to contrast
#   group1      level used as the first group of the contrast
#   group2      level used as the second group of the contrast
#   FC, Padj    cut-offs forwarded to plot_volcano()
#
# Reads path_output from the calling environment and volcano_plot_genes.tsv
# from the working directory.
#
# Returns the results as a tibble with a geneID column, sorted by padj.
my_DESeq2 <- function(countData, sampleData, contrast, group1, group2, FC = 1.5, Padj = 0.05) {
  DESeq2Object <- countData
  print(DESeq2Object)

  my_contrast <- c(contrast, group1, group2)
  print(my_contrast)

  # Get the results for this contrast.
  deseq2Results <- results(DESeq2Object, contrast = my_contrast)
  print(deseq2Results)

  deseq2Results <- deseq2Results %>%
    as_tibble(rownames = "geneID") %>%
    dplyr::arrange(padj)
  print(head(deseq2Results))

  comparison_prefix <- paste0(path_output, group1, "_vs_", group2)
  fwrite(deseq2Results, paste0(comparison_prefix, "_DE_Results.tsv"), sep = "\t")

  # Genes to highlight in the volcano plot (column "Genes" of the TSV).
  gene_list <- fread("volcano_plot_genes.tsv", sep = "\t")$Genes

  plot_volcano(
    deseq2Results,
    gene_list = gene_list,
    file_out = comparison_prefix,
    FC = FC,
    Padj = Padj
  )
  #png(paste0(path_output,group1,"_vs_",group2,"_MAplots.png"))
  #DESeq2::plotMA(deseq2Results)
  #dev.off()

  deseq2Results
}


# ---------------------------------------------------------------------------
# Differential expression between the diet_sex classes (design = ~class)
# ---------------------------------------------------------------------------
DESeq2Object <- DESeqDataSetFromMatrix(
  countData = rawCounts_filt,
  colData = sampleData,
  design = ~class
)
dim(DESeq2Object)
DESeq2Object <- DESeq(DESeq2Object)

m <- my_DESeq2(DESeq2Object, sampleData, contrast = "class", "AL_Female", "AL_Male")
m <- my_DESeq2(DESeq2Object, sampleData, contrast = "class", "CR_Female", "CR_Male")
m <- my_DESeq2(DESeq2Object, sampleData, contrast = "class", "CR_Female", "AL_Female")
m <- my_DESeq2(DESeq2Object, sampleData, contrast = "class", "CR_Male", "AL_Male")


# ---------------------------------------------------------------------------
# Effects of sex and diet in the additive model (design = ~sex+diet)
# ---------------------------------------------------------------------------
DESeq2Object <- DESeqDataSetFromMatrix(
  countData = rawCounts_filt,
  colData = sampleData,
  design = ~sex + diet
)
DESeq2Object <- DESeq(DESeq2Object)
m <- my_DESeq2(DESeq2Object, sampleData, contrast = "sex", "Female", "Male")
m <- my_DESeq2(DESeq2Object, sampleData, contrast = "diet", "CR", "AL")

